{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The autoreload extension is already loaded. To reload it, use:\n",
      "  %reload_ext autoreload\n"
     ]
    }
   ],
   "source": [
    "from IPython.core.interactiveshell import InteractiveShell\n",
    "InteractiveShell.ast_node_interactivity = 'all'  # default is ‘last_expr’\n",
    "\n",
    "%load_ext autoreload\n",
    "%autoreload 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "import sys\n",
    "sys.path.append('/data/home/marmot/camtrap/PyCharm/CameraTraps-benchmark')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 111,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import os\n",
    "\n",
    "from PIL import Image\n",
    "from tqdm import tqdm\n",
    "\n",
    "from data_management.cct_json_utils import CameraTrapJsonUtils, IndexedJsonDb\n",
    "from visualization.visualization_utils import plot_stacked_bar_chart, render_db_bounding_boxes, resize_image"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Exclude insect and distant birds from SS bbox annotations\n",
    "\n",
    "A small number of images with insects and distant birds still have bounding boxes around these. Manually filtering these out to form the `20190903` version of `SnapshotSerengetiBboxes_20190409.json`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('/beaver_disk/camtrap/ss_season1/original/SnapshotSerengetiBboxes_20190409.json') as f:\n",
    "    original = json.load(f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "82938"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": [
       "147026"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": [
       "[{'id': 1, 'name': 'animal'},\n",
       " {'id': 2, 'name': 'person'},\n",
       " {'id': 3, 'name': 'group'},\n",
       " {'id': 4, 'name': 'vehicle'}]"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": [
       "{'contributor': 'SMB',\n",
       " 'date_created': '2019-04-14',\n",
       " 'description': 'Reprocessed bounding box annotations for Snapshot Serengeti seasons 1 to 6.',\n",
       " 'version': '20190409',\n",
       " 'year': 2018}"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(original['images'])\n",
    "len(original['annotations'])\n",
    "original['categories']\n",
    "original['info']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "ss_dir = '/home/marmot/camtrap/mnt/snapshot-serengeti-v2/SER'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 77,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 147026/147026 [00:00<00:00, 1013764.18it/s]\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "614"
      ]
     },
     "execution_count": 77,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "area_threshold_pixels = 300\n",
    "upper_area_threshold_pixels = 400\n",
    "\n",
    "problem_annos = []\n",
    "num_group = 0\n",
    "\n",
    "for a in tqdm(original['annotations']):\n",
    "    if a['category_id'] == 3:\n",
    "        num_group += 1\n",
    "        continue\n",
    "    \n",
    "    assert a['category_id'] in [1, 2]\n",
    "    \n",
    "    _, _, w, h = a['bbox']\n",
    "    area = w * h\n",
    "    if area > area_threshold_pixels and area < upper_area_threshold_pixels:\n",
    "        problem_annos.append(a)\n",
    "len(problem_annos)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "area_threshold_pixels, num of problem_annos\n",
    "\n",
    "100, 575\n",
    "\n",
    "200, 1226\n",
    "\n",
    "300, 1226 + 601 = 1827\n",
    "\n",
    "400, 1827 + 614 = 2441"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 78,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 614/614 [02:34<00:00,  3.23it/s]\n"
     ]
    }
   ],
   "source": [
    "rendered_problem_annos = []\n",
    "\n",
    "for i in tqdm(problem_annos):\n",
    "    image = Image.open(os.path.join(ss_dir, i['image_id'] + '.JPG'))\n",
    "    category = i['category_id']\n",
    "    render_db_bounding_boxes([i['bbox']] , [category], image, original_size=None, label_map=None, thickness=4)\n",
    "    image = resize_image(image, 1000)\n",
    "    rendered_problem_annos.append((i['id'], i['image_id'], image))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 79,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "614"
      ]
     },
     "execution_count": 79,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(rendered_problem_annos)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "for anno_id, image_id, im in rendered_problem_annos[550:]:\n",
    "    print(anno_id)\n",
    "    print(image_id)\n",
    "    im\n",
    "    print('')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\r",
      "  0%|          | 0/147026 [00:00<?, ?it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'id': 'jGmjS1533032672368', 'category_id': 1, 'image_id': 'S1/J04/J04_R2/S1_J04_R2_PICT0968', 'bbox': [271.9229798279782, 345.93088361620994, 10.652652818003354, 14.577314382531071]}\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "for a in tqdm(original['annotations']):\n",
    "    if a['id'] == 'jGmjS1533032672368':\n",
    "        print(a)\n",
    "        break"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Update database JSON"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 94,
   "metadata": {},
   "outputs": [],
   "source": [
    "# these are annotation IDs for annotation entries to delete\n",
    "with open('/beaver_disk/camtrap/ss_season1/manual_filter/SS_annos_to_delete.csv') as f:\n",
    "    annos_to_del = f.read().splitlines()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 97,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "380"
      ]
     },
     "execution_count": 97,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": [
       "353"
      ]
     },
     "execution_count": 97,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(annos_to_del)\n",
    "annos_to_del = [a for a in annos_to_del if a != '']\n",
    "len(annos_to_del)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 101,
   "metadata": {},
   "outputs": [],
   "source": [
    "# these are images where all annotations of bbox smaller than 400 sq pixel can be deleted for all such boxes on the image.\n",
    "with open('/beaver_disk/camtrap/ss_season1/manual_filter/SS_annos_to_delete_image_with_small_boxes.csv') as f:\n",
    "    images_annos_to_del = f.read().splitlines()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 103,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "213"
      ]
     },
     "execution_count": 103,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": [
       "127"
      ]
     },
     "execution_count": 103,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(images_annos_to_del)\n",
    "images_annos_to_del = [a for a in images_annos_to_del if a != '']\n",
    "len(images_annos_to_del)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 106,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "353"
      ]
     },
     "execution_count": 106,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": [
       "124"
      ]
     },
     "execution_count": 106,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "annos_to_del = set(annos_to_del)\n",
    "images_annos_to_del = set(images_annos_to_del)\n",
    "\n",
    "len(annos_to_del)\n",
    "len(images_annos_to_del)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Exclude annotations manually filtered out from the new version of annotation entries."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 108,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      " 42%|████▏     | 62446/147026 [00:00<00:00, 624448.69it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Originally had 147026 annotation entries.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 147026/147026 [00:00<00:00, 612427.62it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "After the exclusions, now have 146359 annotation entries.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "print('Originally had {} annotation entries.'.format(len(original['annotations'])))\n",
    "\n",
    "new_annotations = []\n",
    "num_bboxes_excluded = 0\n",
    "\n",
    "for a in tqdm(original['annotations']):\n",
    "    # if in list of annotation IDs, exclude this entry\n",
    "    if a['id'] in annos_to_del:\n",
    "        num_bboxes_excluded += 1\n",
    "        continue\n",
    "    \n",
    "    if a['category_id'] == 3:\n",
    "        new_annotations.append(a)\n",
    "    else:\n",
    "        assert a['category_id'] in [1, 2]\n",
    "\n",
    "        _, _, w, h = a['bbox']\n",
    "        area = w * h\n",
    "        if area < 400 and a['image_id'] in images_annos_to_del:\n",
    "            continue\n",
    "        else:\n",
    "            new_annotations.append(a)\n",
    "print('After the exclusions, now have {} annotation entries.'.format(len(new_annotations)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 109,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "667"
      ]
     },
     "execution_count": 109,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "147026 - 146359  # number of bboxes excluded"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Make no changes to the image entries. If an image no longer has annotation entries, it is confirmed empty."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 114,
   "metadata": {},
   "outputs": [],
   "source": [
    "new_info = {\n",
    "    'contributor': 'Sara Beery, this version updated by Siyu Yang',\n",
    "    'date_created': '2019-09-03',\n",
    "    'description': 'Reprocessed bounding box annotations for Snapshot Serengeti seasons 1 to 6, with remaining small insect and distant bird bboxes smaller than 400 sq pixel excluded manually.',\n",
    "    'version': '20190903',\n",
    "    'year': 2019\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 115,
   "metadata": {},
   "outputs": [],
   "source": [
    "new_db = {\n",
    "    'info': new_info,\n",
    "    'categories': original['categories'],\n",
    "    'annotations': new_annotations,\n",
    "    'images': original['images']\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 119,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "146359"
      ]
     },
     "execution_count": 119,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": [
       "82938"
      ]
     },
     "execution_count": 119,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(new_db['annotations'])\n",
    "len(new_db['images'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 120,
   "metadata": {},
   "outputs": [],
   "source": [
    "new_db = CameraTrapJsonUtils.order_db_keys(new_db)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 121,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('/beaver_disk/camtrap/ss_season1/original/SnapshotSerengetiBboxes_20190903.json', 'w') as f:\n",
    "    json.dump(new_db, f, indent=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python [default]",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
